# Load libraries
library(tidyverse)
library(plotly)
library(future)
library(furrr)
library(tidybulk)
library(cluster)
library(proxy)
library(factoextra)
library(stringr)
# Load data
tt_all <- readRDS("intermediate_data/tt_all.rds")

tt_L3 <- tt_all %>% 
  pluck("tt", 3)
# select level of interest
LEVEL = "level_3"
# All functions

## 1 preprocess data

### 1.1 string manipulation that converts level of interest (e.g "level_5") to its ancestor level (e.g "level_4")
pre <- function(.level) {
  .level %>% 
    str_split("_") %>% 
    {as.numeric(.[[1]][2])-1} %>% 
    paste("level", ., sep = "_")
}

### 1.2 preprocess

preprocess <- function(.data, LEVEL) {
  
  # load data
  .data %>%
    
    tidybulk(sample, symbol, count) %>%
    
    # filter for the cell types of interest for gene marker selection
    filter(is.na(!!as.symbol(LEVEL))==F) %>%
    
    # Imputation of missing data within each level_5
    # impute_missing_abundance(~ !!as.symbol(LEVEL)) %>%
    
    # Group by ancestor
    nest(data = - !!as.symbol(pre(LEVEL))) %>%
    
    # Eliminate genes that are present in some but all cell types
    # (can be still present in a subset of samples from each cell-type)
    mutate(data = map(
      data,
      ~ .x %>%
        nest(data = -c(symbol, !!as.symbol(LEVEL))) %>%
        add_count(symbol) %>%
        filter(n == max(n)) %>%
        unnest(data)
    )) %>%
    
    # Imputation of missing data within each level_5
    mutate(data = map(data, ~ .x %>% impute_missing_abundance(~ !!as.symbol(LEVEL)))) %>%
    
    # scale count for further analysis
    mutate(data=map(data, ~ .x %>%
                      identify_abundant(factor_of_interest = !!as.symbol(LEVEL)) %>%
                      scale_abundance()
    ))
}

## 2 contrast functions

### 2.1 pairwise comparisons
get_contrasts_from_df = function(.data, LEVEL){
  
  .data %>% 
    distinct(!!as.symbol(LEVEL)) %>% 
    mutate(!!as.symbol(LEVEL) := paste0(LEVEL, !!as.symbol(LEVEL))) %>% 
    
    # Permute
    mutate(cell_type2 := !!as.symbol(LEVEL)) %>% 
    expand(!!as.symbol(LEVEL), cell_type2) %>% 
    filter(!!as.symbol(LEVEL) != cell_type2) %>% 
    
    # Create contrasts
    mutate(contrast = sprintf("%s - %s", !!as.symbol(LEVEL), cell_type2)) %>%
    pull(contrast)
  
}

### 2.2 create a contrast vector for limma::makeContrasts() or tidybulk::test_differential abundance()

mean_contrast <- function(.data, LEVEL){
  
  # find all cell types
  cell_types <- .data %>% 
    distinct(!!as.symbol(LEVEL)) %>% 
    pull() %>% 
    as.vector()
  
  # format cell_types with prefix
  cell_types <- paste0(LEVEL, cell_types)
  
  # initialise a vector called contrasts
  contrasts <- 1: length(cell_types)
  
  # create all contrasts and store them in contrasts
  for(i in 1:length(cell_types) ){
    background = paste(cell_types[-i], collapse = "+")
    divisor = length(cell_types[-i])
    contrasts[i] <- sprintf("%s-(%s)/%s", cell_types[i], background, divisor)
  }
  
  return(contrasts)
}


### 2.3 contrast with no average background

mean_contrasts2 <- function(.data, LEVEL){
  
  # find all cell types
  cell_types <- .data %>% 
    distinct(!!as.symbol(LEVEL)) %>% 
    pull() %>% 
    as.vector()
  
  # format cell_types with prefix
  cell_types <- paste0(LEVEL, cell_types)
  
  # initialise a vector called contrasts
  contrasts <- 1: length(cell_types)
  
  # create all contrasts and store them in contrasts
  for(i in 1: length(cell_types) ){
    background = paste(cell_types[-i], collapse = "+")
    contrasts[i] <- sprintf("%s-(%s)", cell_types[i], background)
  }
  
  return(contrasts)
}

## 3 marker ranking & selection

select_markers_for_each_contrast = function(.markers, sig_size){
  .markers %>%
    
    # Group by contrast. Comparisons both ways.
    pivot_longer(
      cols = contains("___"),
      names_to = c("stats", "contrast"), 
      values_to = ".value", 
      names_sep="___"
    ) %>% 
    
    # Markers selection within each pair of contrast
    nest(stat_df = -contrast) %>%
    
    # Reshape inside each contrast
    mutate(stat_df = map(stat_df, ~.x %>% pivot_wider(names_from = stats, values_from = .value))) %>%
    
    # Rank
    mutate(stat_df = map(stat_df, ~.x %>%
                           filter(FDR < 0.05 & logFC > 2) %>%
                           filter(logCPM > mean(logCPM)) %>%
                           arrange(logFC %>% desc()) %>%
                           slice(1:sig_size)
                         
    )) %>%
    
    unnest(stat_df)
}

## 4 marker collection for each contrast

contrast1 <- function(tt, LEVEL){
  tt %>%
    
    # Differential transcription
    mutate(markers = map(
      data,
      ~ test_differential_abundance(.x,
                                    ~ 0 + !!as.symbol(LEVEL), 
                                    .contrasts = get_contrasts_from_df(.x, LEVEL),
                                    action="only") 
    ))
}

contrast2 <- function(tt, LEVEL){
  tt %>%
    
    # Differential transcription
    mutate(markers = map(
      data,
      ~ test_differential_abundance(.x,
                                    ~ 0 + !!as.symbol(LEVEL), 
                                    .contrasts = mean_contrast(.x, LEVEL),
                                    action="only") 
    ))
}

sig_select <- function(.contrast, LEVEL, sig_size) {
  .contrast %>% 
    
    # Select markers from each contrast by rank of stats
    mutate(markers = map(markers, ~ select_markers_for_each_contrast(.x, sig_size))) %>%
    
    # Add original data info to the markers selected
    mutate(markers = map2(markers, data, ~ left_join(.x, .y))) %>%
    select(!!as.symbol(pre(LEVEL)), markers) %>%
    unnest(markers) %>%
    
    # make contrasts pretty
    mutate(contrast_pretty = str_replace(contrast, LEVEL, "") %>% str_replace(LEVEL, ""))
}


## 5 calculate the area of confidence ellipses and the sum of their areas

ellip_area <- function(.markers, LEVEL){
  
  # reduce dimension
  PCA <- .markers %>%
    distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)) %>% 
    reduce_dimensions(sample, symbol, count_scaled, method = "PCA", action = "add", transform = log1p)
  
  # number of unique symbols for each cell type
  real_size <- PCA %>% 
    nest(data=-!!as.symbol(LEVEL)) %>% 
    mutate(real_size=map_int(data, ~ n_distinct(.x$symbol)))
  
  area <- PCA %>%   
    # remove non-numerical data to form a numerical data frame
    select(!!as.symbol(LEVEL), PC1, PC2) %>%
    
    # normalize principle component values
    mutate(across(c("PC1", "PC2"), ~ .x %>% scale())) %>% 
    
    # nest by cell_type so as to calculate ellipse area for each cell type
    nest(PC = - !!as.symbol(LEVEL)) %>% 
    
    # obtain covariance matrix for each cell type
    mutate(cov = map(PC, ~ cov(.x))) %>% 
    
    # calculate the eigenvalues for the covariance matrix of each cell type
    mutate(eigval = map(cov, ~ eigen(.x)$values)) %>% 
    
    # transformation
    mutate(area = map(eigval, ~ sqrt(.x * qchisq(0.95, 2)))) %>%
    
    # below is the actual area for each ellipse
    mutate(area = map_dbl(area, ~ prod(.x)*pi)) %>% 
    
    # collect size of each cluster as factors for weights
    mutate(cluster_size = map_int(PC, ~ nrow(.x))) %>%
    
    # weight each area by the inverse of its cluster size
    mutate(weighted_area = map2_dbl(area, cluster_size, ~ .x / .y))
  
  left_join(area, real_size)
  
}

## 5.2 Ellipse area calculation for a series of sig_sizes
area_df_func <- function(.area_df, LEVEL){
  .area_df %>% 
    nest(markers = - !!as.symbol(pre(LEVEL))) %>% 
    mutate(ellip = map(markers, ~ .x %>% ellip_area(LEVEL)))
}

## 6 Silhouette score calculation for a series of sig_sizes
sil_func <- function(.sil_df, LEVEL){
  .sil_df %>%
    nest(pca = - !!as.symbol(pre(LEVEL))) %>%
    mutate(pca = map(pca, ~ .x %>%
                       distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>%
    mutate(pca = map(pca, ~ .x %>%
                       reduce_dimensions(sample, symbol, count_scaled,
                                         method = "PCA",
                                         action = "add",
                                         transform = log1p) %>% 
                       
                       # save symbols for calculating real_size while reducing replicated rows resulted from symbol
                       nest(data_symbol = c(symbol, count_scaled))
    )) %>%
    
    # calculate the dissimilarity matrix with PC values
    mutate(distance = map(pca, ~ .x %>%
                            select(contains("PC")) %>%
                            factoextra::get_dist(method = "euclidean")
    )) %>%
    
    # calculate silhouette score
    mutate(sil = map2(pca, distance,
                      ~ silhouette(as.numeric(as.factor(`$`(.x, !!as.symbol(LEVEL)))), .y)
    )) %>%
    mutate(sil = map(sil, ~ .x %>% summary())) %>%
    mutate(sil = map(sil, ~ .x %>% 
                       `$`(avg.width) ))%>% 
    mutate(sil = unlist(sil)) %>% 
    
    # obtain the actual number of signature genes
    mutate(real_size = map_int(pca, ~ .x$data_symbol %>% 
                                 map_int(~ n_distinct(.x$symbol)) %>% 
                                 unlist() %>% 
                                 unique() ))
  
}

1 Pre-analysis

1.1 Preprocess data

# 1 Setup data frame & preprocessing

# tt_L3 <- preprocess(counts, LEVEL)
# View cell types on level_1
tt_L3 %>% 
  unnest(data) %>% 
  select(!!as.symbol(pre(LEVEL))) %>% 
  distinct()
## # A tibble: 5 x 1
##   level_2       
##   <chr>         
## 1 mono_derived  
## 2 t_cell        
## 3 granulocyte   
## 4 b_cell        
## 5 natural_killer

1.2 Cluster without marker selection for contrast

# 2 No selection of markers
tt_naive_L3 <-
  tt_L3 %>%

  # Scale and reduce dimensions
  mutate(data = map(
    data,
    ~ .x %>%
      reduce_dimensions(method="PCA")
      # reduce_dimensions(method="MDS") %>%
      # reduce_dimensions(method="tSNE")
  ))
PCA_naive_mono <- tt_naive_L3 %>% 
  pluck("data", 1) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA_naive_mono

PCA_naive_t <- tt_naive_L3 %>% 
  pluck("data", 2) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA_naive_t

PCA_naive_granulocyte <- tt_naive_L3 %>% 
  pluck("data", 3) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA_naive_granulocyte

PCA_naive_b <- tt_naive_L3 %>% 
  pluck("data", 4) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA_naive_b

PCA_naive_NK <- tt_naive_L3 %>% 
  pluck("data", 5) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA_naive_NK

2 Hierarchy + Pairwise Analysis

2.0 Generate pairwise contrast

contrast_L3 <- tt_all %>% 
  pluck("contrast1", 3)

2.1 Ellipse Area Analysis

sig_size <- 20

# create a tibble that stores the confidence ellipse area output for each signature size
area_tb <-
  tibble(sig_size = 1:sig_size) %>%
  # slice(1) %>%
  mutate(area_df = map(sig_size, ~ sig_select(contrast_L3, LEVEL, .x))) %>%
  mutate(area_df = map(area_df, ~ area_df_func(.x, LEVEL)))

# rescale areas and plot total areas vs the total number of markers selected from cell types in a level
area_data <- area_tb %>%
  
  unnest(area_df) %>%
  unnest(ellip) %>%
  
  # nest by ancestor cell type to rescale area for all sig_sizes
  nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
  mutate(cell_data = map(cell_data, ~ .x %>% mutate(rescaled_area = area %>% scale(center = F)))) %>%
  unnest(cell_data) %>%
  
  # nest by ancestor cell type to summarise areas for each real_size/sig_size
  nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
  mutate(plot_data = map(cell_data, ~ .x %>%
                           group_by(real_size) %>%
                           summarise(sig_size,
                                     stded_sum=sum(area, na.rm = T),
                                     wted_sum = sum(weighted_area, na.rm = T),
                                     rescaled_sum= sum(rescaled_area, na.rm = T)) %>%
                           pivot_longer(ends_with("sum"), names_to='area_type', values_to="area_value")
  ))

2.1.1 Signature size selection & PCA plot for monocyte_derived cell

Signature size selection plot for mono_derived cell:

mono_elli <- area_data %>% 
  pluck("plot_data", 1) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # scale_x_continuous(sec.axis = sec_axis(as.factor())) +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

mono_elli

The elbow point indicates optimal sig_size is \(5\). PCA at sig_size = \(5\):

sig_size <- 5

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>% 
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_mono_elli <- PCA_level3 %>% 
  pluck("pca", 1) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_mono_elli

2.1.2 Signature size selection & PCA plot for t cell

Signature size selection plot for t cell:

t_elli <- area_data %>% 
  pluck("plot_data", 2) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

t_elli

The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>% 
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_t_elli <- PCA_level3 %>% 
  pluck("pca", 2) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_t_elli

2.1.3 Signature size selection & PCA plot for granulocyte

Signature size selection plot for granulocyte:

granulocyte_elli <- area_data %>% 
  pluck("plot_data", 3) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

granulocyte_elli

The elbow point indicates optimal sig_size is \(2\). PCA at sig_size = \(2\):

sig_size <- 2

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>% 
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_granulocyte <- PCA_level3 %>% 
  pluck("pca", 3) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_granulocyte

2.1.4 Signature size selection & PCA plot for b cells

Signature size selection plot for b cell:

b_elli <- area_data %>% 
  pluck("plot_data", 4) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

b_elli

The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>% 
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_b <- PCA_level3 %>% 
  pluck("pca", 4) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_b

2.1.5 Signature size selection & PCA plot for NK cell

Signature size selection plot for NK:

NK_elli <- area_data %>% 
  pluck("plot_data", 5) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

NK_elli

The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>% 
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_NK <- PCA_level3 %>% 
  pluck("pca", 5) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_NK

2.2 Silhouette Analysis

sig_size <- 60

sil_tb <-
  tibble(sig_size = 1:sig_size) %>%
  # slice(1) %>%
  mutate(sil_df = map(sig_size, ~ sig_select(contrast_L3, LEVEL, .x))) %>%
  mutate(sil_df = map(sil_df, ~.x %>% sil_func(LEVEL)))

sil_data <- sil_tb %>%
  unnest(sil_df) %>%
  nest(plot_data = - !!as.symbol(pre(LEVEL)))

2.2.1 Signature size selection & PCA plot for monocyte_derived cell

Signature size selection plot for mono_derived cell:

mono_sil <- sil_data %>%
  pluck("plot_data", 1) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

mono_sil

sil_data %>% 
  pluck("plot_data", 1) %>%
  pull(sil) %>% 
  which.max()
## [1] 60

The peak is reached when sig_size = \(11\). PCA at sig_size = \(11\):

sig_size <- 11

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_mono_sil <- PCA_level3 %>% 
  pluck("pca", 1) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_mono_sil

2.2.2 Signature size selection & PCA plot for t cell

Signature size selection plot for t cell:

t_sil <- sil_data %>%
  pluck("plot_data", 2) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

t_sil

sil_data %>% 
  pluck("plot_data", 2) %>%
  pull(sil) %>% 
  which.max()
## [1] 5

The peak is reached when sig_size = \(5\). PCA at sig_size = \(5\):

sig_size <- 5

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_t_sil <- PCA_level3 %>% 
  pluck("pca", 2) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_t_sil

2.2.3 Signature size selection & PCA plot for granulocyte

Signature size selection plot for granulocyte:

granulocyte_sil <- sil_data %>%
  pluck("plot_data", 3) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

granulocyte_sil

sil_data %>% 
  pluck("plot_data", 3) %>%
  pull(sil) %>% 
  which.max()
## [1] 1

The peak is reached when sig_size = \(1\). PCA at sig_size = \(1\):

sig_size <- 1

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_granulocyte_sil <- PCA_level3 %>% 
  pluck("pca", 3) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_granulocyte_sil

2.2.4 Signature size selection & PCA plot for b cell

Signature size selection plot for b cell:

b_sil <- sil_data %>%
  pluck("plot_data", 4) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

b_sil

sil_data %>% 
  pluck("plot_data", 4) %>%
  pull(sil) %>% 
  which.max()
## [1] 4

The peak is reached when sig_size = \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_b_sil <- PCA_level3 %>% 
  pluck("pca", 4) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_b_sil

2.2.5 Signature size selection & PCA plot for NK

Signature size selection plot for NK:

NK_sil <- sil_data %>%
  pluck("plot_data", 5) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

NK_sil

sil_data %>% 
  pluck("plot_data", 5) %>%
  pull(sil) %>% 
  which.max()
## [1] 15

The peak is reached when sig_size = \(15\). PCA at sig_size = \(15\):

sig_size <- 15

PCA_level3 <- contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_NK_sil <- PCA_level3 %>% 
  pluck("pca", 5) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_NK_sil

3 Hierarchy + mean contrast analysis

3.0 Generate mean contrast

mean_contrast_L3 <- tt_all %>%
  pluck("contrast2", 3)

3.1 Ellipse Area Analysis

sig_size <- 20

# create a tibble that stores the confidence ellipse area output for each signature size
area_tb <-
  tibble(sig_size = 1:sig_size) %>%
  # slice(1) %>%
  mutate(area_df = map(sig_size, ~ sig_select(mean_contrast_L3, LEVEL, .x))) %>%
  mutate(area_df = map(area_df, ~ area_df_func(.x, LEVEL)))

# rescale areas and plot total areas vs the total number of markers selected from cell types in a level
area_data <- area_tb %>%
  
  unnest(area_df) %>%
  unnest(ellip) %>%
  
  # nest by ancestor cell type to rescale area for all sig_sizes
  nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
  mutate(cell_data = map(cell_data, ~ .x %>% mutate(rescaled_area = area %>% scale(center = F)))) %>%
  unnest(cell_data) %>%
  
  # nest by ancestor cell type to summarise areas for each real_size/sig_size
  nest(cell_data = - !!as.symbol(pre(LEVEL))) %>%
  mutate(plot_data = map(cell_data, ~ .x %>%
                           group_by(real_size) %>%
                           summarise(sig_size,
                                     stded_sum=sum(area, na.rm = T),
                                     wted_sum = sum(weighted_area, na.rm = T),
                                     rescaled_sum= sum(rescaled_area, na.rm = T)) %>%
                           pivot_longer(ends_with("sum"), names_to='area_type', values_to="area_value")
  ))

3.1.1 Signature size selection & PCA plot for monocyte derived cells

Signature size selection plot for mono_derived cell:

mono_elli <- area_data %>% 
  pluck("plot_data", 1) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # scale_x_continuous(sec.axis = sec_axis(as.factor())) +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

mono_elli

The elbow point indicates optimal sig_size is \(7\). PCA at sig_size = \(7\):

sig_size <- 7

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))


PCA3_mono_elli <- PCA_level3 %>% 
  pluck("pca", 1) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_mono_elli

3.1.2 Signature size selection & PCA plot for t cell

Signature size selection plot for t cell:

t_elli <- area_data %>% 
  pluck("plot_data", 2) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

t_elli

The elbow point indicates optimal sig_size is \(7\). PCA at sig_size = \(7\):

sig_size <- 7

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))


PCA3_t_elli <- PCA_level3 %>% 
  pluck("pca", 2) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_t_elli

3.1.3 Signature size selection & PCA plot for granulocytes

Signature size selection plot for granulocytes:

granulocyte_elli <- area_data %>% 
  pluck("plot_data", 3) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

granulocyte_elli

The elbow point indicates optimal sig_size is \(2\). PCA at sig_size = \(2\):

sig_size <- 2

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))


PCA3_granulocyte__elli <- PCA_level3 %>% 
  pluck("pca", 3) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_granulocyte__elli

3.1.4 Signature size selection & PCA plot for b cell

Signature size selection plot for b cells:

b_elli <- area_data %>% 
  pluck("plot_data", 4) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

b_elli

The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))


PCA3_b__elli <- PCA_level3 %>% 
  pluck("pca", 4) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_b__elli

3.1.5 Signature size selection & PCA plot for NK cells

Signature size selection plot for NK:

NK_elli <- area_data %>% 
  pluck("plot_data", 5) %>% 
  ggplot(aes(real_size, area_value, colour=area_type)) + 
  geom_line() +
  geom_point() +
  # facet_grid(rows = vars(area_type), scales = "free_y")
  facet_wrap(~ area_type, scales = "free_y")

NK_elli

The elbow point indicates optimal sig_size is \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca = - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))


PCA3_NK_elli <- PCA_level3 %>% 
  pluck("pca", 5) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_NK_elli

3.2 Silhouette Analysis

sig_size <- 60

sil_tb <-
  tibble(sig_size = 1:sig_size) %>%
  # slice(1) %>%
  mutate(sil_df = map(sig_size, ~ sig_select(mean_contrast_L3, LEVEL, .x))) %>%
  mutate(sil_df = map(sil_df, ~.x %>% sil_func(LEVEL)))

sil_data <- sil_tb %>%
  unnest(sil_df) %>%
  nest(plot_data = - !!as.symbol(pre(LEVEL)))

3.2.1 Signature size selection & PCA plot for monocyte derived cells

Signature size selection plot for mono_derived cell:

mono_sil <- sil_data %>%
  pluck("plot_data", 1) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

mono_sil

sil_data %>% 
  pluck("plot_data", 1) %>%
  pull(sil) %>% 
  which.max()
## [1] 29

The peak is reached when sig_size = \(29\). PCA at sig_size = \(29\), \ but silhouette score has only minimal improvement than that when sig_size = \(17\):

sig_size <- 17

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca= - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_mono_sil <- PCA_level3 %>% 
  pluck("pca", 1) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_mono_sil

3.2.2 Signature size selection & PCA plot for t cells

Signature size selection plot for t cell:

t_sil <- sil_data %>%
  pluck("plot_data", 2) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

t_sil

sil_data %>% 
  pluck("plot_data", 2) %>%
  pull(sil) %>% 
  which.max()
## [1] 31

The peak is reached when sig_size = \(26\). PCA at sig_size = \(26\):

sig_size <- 26

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca= - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_t_sil <- PCA_level3 %>% 
  pluck("pca", 2) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_t_sil

3.2.3 Signature size selection & PCA plot for granulocytes

Signature size selection plot for granulocytes:

granulocyte_sil <- sil_data %>%
  pluck("plot_data", 3) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

granulocyte_sil

sil_data %>% 
  pluck("plot_data", 3) %>%
  pull(sil) %>% 
  which.max()
## [1] 1

The peak is reached when sig_size = \(1\). PCA at sig_size = \(1\):

sig_size <- 1

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca= - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_granulocyte_sil <- PCA_level3 %>% 
  pluck("pca", 3) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_granulocyte_sil

3.2.4 Signature size selection & PCA plot for b cells

Signature size selection plot for b cells:

b_sil <- sil_data %>%
  pluck("plot_data", 4) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

b_sil

sil_data %>% 
  pluck("plot_data", 4) %>%
  pull(sil) %>% 
  which.max()
## [1] 4

The peak is reached when sig_size = \(4\). PCA at sig_size = \(4\):

sig_size <- 4

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca= - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_b_sil <- PCA_level3 %>% 
  pluck("pca", 4) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_b_sil

3.2.5 Signature size selection & PCA plot for granulocytes

Signature size selection plot for NK cells:

NK_sil <- sil_data %>%
  pluck("plot_data", 5) %>% 
  ggplot(aes(real_size, sil)) +
  geom_line() +
  geom_point()

NK_sil

sil_data %>% 
  pluck("plot_data", 5) %>%
  pull(sil) %>% 
  which.max()
## [1] 15

The peak is reached when sig_size = \(15\). PCA at sig_size = \(15\):

sig_size <- 15

PCA_level3 <- mean_contrast_L3 %>% 
  sig_select(LEVEL, sig_size) %>%
  nest(pca= - !!as.symbol(pre(LEVEL))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     distinct(sample, symbol, count_scaled, !!as.symbol(LEVEL)))) %>% 
  mutate(pca = map(pca, ~ .x %>% 
                     reduce_dimensions(
                       sample, symbol, count_scaled, 
                       method = "PCA", 
                       action="get", 
                       transform = log1p)))

PCA3_NK_sil <- PCA_level3 %>% 
  pluck("pca", 5) %>% 
  ggplot(aes(x = PC1, y = PC2, colour = !!as.symbol(LEVEL), label = sample)) + 
  geom_point() +
  stat_ellipse(type = 't')+
  theme_bw()

PCA3_NK_sil